Improve Utils.normalize_uri (#1719)

* Improve Utils.normalize_uri

Globally replacing generally unsafe characters in a URL would not fix
invalid authorities and paths, so use Addressable::URI to normalize them
when necessary.

This should fix #1701.

* Remove an unused function

* Fix the test case to make sure an IPv6 address is supported

Akinori MUSHA 7 years ago
parent
commit
7e79d576b5
3 changed files with 33 additions and 8 deletions
  1. 24 5
      lib/utils.rb
  2. 2 1
      spec/data_fixtures/urlTest.html
  3. 7 2
      spec/models/agents/website_agent_spec.rb

+ 24 - 5
lib/utils.rb

@@ -1,5 +1,6 @@
1 1
 require 'jsonpath'
2 2
 require 'cgi'
3
+require 'addressable/uri'
3 4
 
4 5
 module Utils
5 6
   def self.unindent(s)
@@ -25,11 +26,29 @@ module Utils
25 26
     begin
26 27
       URI(uri)
27 28
     rescue URI::Error
28
-      URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
29
-            unsafe.bytes.each_with_object(String.new) { |uc, s|
30
-              s << sprintf('%%%02X', uc)
31
-            }
32
-          }.force_encoding(Encoding::US_ASCII))
29
+      begin
30
+        URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
31
+              unsafe.bytes.each_with_object(String.new) { |uc, s|
32
+                s << sprintf('%%%02X', uc)
33
+              }
34
+            }.force_encoding(Encoding::US_ASCII))
35
+      rescue URI::Error => e
36
+        begin
37
+          auri = Addressable::URI.parse(uri.to_s)
38
+        rescue
39
+          # Do not leak Addressable::URI::InvalidURIError which
40
+          # callers might not expect.
41
+          raise e
42
+        else
43
+          # Addressable::URI#normalize! modifies the query and
44
+          # fragment components beyond escaping unsafe characters, so
45
+          # avoid using it.  Otherwise `?a[]=%2F` would be normalized
46
+          # as `?a%5B%5D=/`, for example.
47
+          auri.site = auri.normalized_site
48
+          auri.path = auri.normalized_path
49
+          URI(auri.to_s)
50
+        end
51
+      end
33 52
     end
34 53
   end
35 54
 

+ 2 - 1
spec/data_fixtures/urlTest.html

@@ -12,6 +12,7 @@
12 12
             <li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li>
13 13
             <li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li>
14 14
             <li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li>
15
+            <li><a href="http://[::1]/path[]?query[]=foo">brackets</a></li>
15 16
         </ul>
16 17
     </body>
17
-</html>
18
+</html>

+ 7 - 2
spec/models/agents/website_agent_spec.rb

@@ -1105,8 +1105,8 @@ fire: hot
1105 1105
 
1106 1106
     describe "#check" do
1107 1107
       before do
1108
-        expect { @checker.check }.to change { Event.count }.by(7)
1109
-        @events = Event.last(7)
1108
+        expect { @checker.check }.to change { Event.count }.by(8)
1109
+        @events = Event.last(8)
1110 1110
       end
1111 1111
 
1112 1112
       it "should check hostname" do
@@ -1143,6 +1143,11 @@ fire: hot
1143 1143
         event = @events[6]
1144 1144
         expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
1145 1145
       end
1146
+
1147
+      it "should check url with unescaped brackets in the path component" do
1148
+        event = @events[7]
1149
+        expect(event.payload['url']).to eq("http://[::1]/path%5B%5D?query[]=foo")
1150
+      end
1146 1151
     end
1147 1152
   end
1148 1153
 end